How to do it?:
Open the Rmarkdown file of this assignment (link) in Rstudio.
Right under each question, insert a code chunk (you can use the hotkey Ctrl + Alt + I to add a code chunk) and code the solution for the question.
Knit the R Markdown file (hotkey: Ctrl + Alt + K) to export an HTML file.
Publish the HTML file to your GitHub Pages site.
Submission: Submit the GitHub link of the assignment to Blackboard.
Input: a data frame
Output: a data frame with all the missing values of numeric variables replaced by the associated means.
Hint: Similar function
# NOTE(review): warn = -1 silences every warning for the rest of the session,
# which can hide real problems (e.g. mean() being called on a character
# column) — confirm this is still wanted.
options(warn = -1)
library(tidyverse)
# Load the Titanic data and list the columns by number of missing values,
# largest first.
df <- read_csv("titanic.csv")
sort(colSums(is.na(df)), decreasing = TRUE)
## Cabin Age Embarked PassengerId Survived Pclass
## 687 177 2 0 0 0
## Name Sex SibSp Parch Ticket Fare
## 0 0 0 0 0 0
# Replace the missing values of a numeric vector with the mean of its
# observed values.
#
# x: a vector, typically one column of a data frame.
#
# Returns x with every NA replaced by mean(x, na.rm = TRUE) when x is numeric
# and has at least one observed value. Non-numeric vectors, vectors without
# NAs and all-NA vectors are returned unchanged. An integer vector that gets
# imputed comes back as double, because the mean is generally not an integer.
mean_impute <- function(x) {
  # Only numeric vectors have a meaningful mean; without this guard mean()
  # is called on character columns and merely warns while returning NA.
  if (!is.numeric(x)) {
    return(x)
  }
  missing <- is.na(x)
  # Nothing to fill, or no observed value to compute a mean from.
  if (!any(missing) || all(missing)) {
    return(x)
  }
  x[missing] <- mean(x, na.rm = TRUE)
  x
}
# Impute the missing values of every numeric column of a data frame with that
# column's mean.
#
# d: a data frame.
#
# Returns d with each numeric column passed through mean_impute(); non-numeric
# columns are left untouched.
numeric_impute <- function(d) {
  # seq_along() gives an empty sequence for a zero-column data frame, whereas
  # 1:length(d) would iterate over c(1, 0) and fail.
  for (i in seq_along(d)) {
    column <- d[[i]]
    # Only hand numeric columns to the mean imputer.
    if (is.numeric(column)) {
      d[[i]] <- mean_impute(column)
    }
  }
  d
}
# Impute the numeric columns, then re-count the missing values per column.
df <- numeric_impute(df)
sort(colSums(is.na(df)), decreasing = TRUE)
## Cabin Embarked PassengerId Survived Pclass Name
## 687 2 0 0 0 0
## Sex Age SibSp Parch Ticket Fare
## 0 0 0 0 0 0
Input: a data frame
Output: a data frame with all the missing values replaced by the associated means (for numeric variables) or modes (for non-numeric variables).
Hint: Combine the function in Problem 1 and the function in this example
# Reload the raw data so the imputation below starts from the original NAs.
df <- read_csv("titanic.csv")
# Replace the missing values of a numeric vector with the mean of its
# observed values.
#
# x: a vector, typically one column of a data frame.
#
# Returns x with every NA replaced by mean(x, na.rm = TRUE) when x is numeric
# and has at least one observed value. Non-numeric vectors, vectors without
# NAs and all-NA vectors are returned unchanged. An integer vector that gets
# imputed comes back as double, because the mean is generally not an integer.
mean_impute <- function(x) {
  # Only numeric vectors have a meaningful mean; without this guard mean()
  # is called on character columns and merely warns while returning NA.
  if (!is.numeric(x)) {
    return(x)
  }
  missing <- is.na(x)
  # Nothing to fill, or no observed value to compute a mean from.
  if (!any(missing) || all(missing)) {
    return(x)
  }
  x[missing] <- mean(x, na.rm = TRUE)
  x
}
# Replace the missing values of a non-numeric vector with its mode (the most
# frequent observed value).
#
# x: a vector, typically one column of a data frame.
#
# Returns x with every NA replaced by the mode when x is non-numeric and has
# at least one observed value. Numeric vectors, vectors without NAs and all-NA
# vectors are returned unchanged. The replacement keeps the type of x
# (character, logical, factor, ...). Ties are broken in favour of the value
# that sorts first, the same order table() uses.
mode_impute <- function(x) {
  if (is.numeric(x)) {
    return(x)
  }
  missing <- is.na(x)
  # Nothing to fill, or no observed value to take a mode from.
  if (!any(missing) || all(missing)) {
    return(x)
  }
  observed <- x[!missing]
  # Sorted distinct values, so which.max() picks the first-sorting value
  # among equally frequent ones.
  candidates <- sort(unique(observed))
  counts <- tabulate(match(observed, candidates), nbins = length(candidates))
  x[missing] <- candidates[which.max(counts)]
  x
}
# Impute every column of a data frame: numeric columns with their mean,
# all other columns with their mode.
#
# d: a data frame.
#
# Returns d with each column passed through mean_impute() (numeric) or
# mode_impute() (non-numeric).
numeric_impute <- function(d) {
  # seq_along() gives an empty sequence for a zero-column data frame, whereas
  # 1:length(d) would iterate over c(1, 0) and fail.
  for (i in seq_along(d)) {
    column <- d[[i]]
    # Dispatch on the column type in a single pass instead of running both
    # imputers over every column.
    d[[i]] <- if (is.numeric(column)) {
      mean_impute(column)
    } else {
      mode_impute(column)
    }
  }
  d
}
# Missing-value counts per column before imputation.
sort(colSums(is.na(df)), decreasing = TRUE)
## Cabin Age Embarked PassengerId Survived Pclass
## 687 177 2 0 0 0
## Name Sex SibSp Parch Ticket Fare
## 0 0 0 0 0 0
# Impute every column, then re-count the missing values per column.
df <- numeric_impute(df)
sort(colSums(is.na(df)), decreasing = TRUE)
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 0
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 0 0
Input: a data frame
Output: Bar plots of all non-numeric variables
Hint: Similar function
# Print a bar plot for every non-numeric column of a data frame.
#
# d: a data frame.
#
# Called for its side effect (one printed ggplot per non-numeric column);
# returns NULL invisibly.
bar_plot <- function(d) {
  # seq_along() is safe for a zero-column data frame, unlike 1:length(d).
  for (i in seq_along(d)) {
    if (is.numeric(d[[i]])) {
      next
    }
    col_name <- names(d)[i]
    # Map the column through the .data pronoun so the aesthetic refers to a
    # column of the plot data rather than to an external vector. ggplot2 is
    # called by namespace instead of being attached from inside the function.
    print(
      ggplot2::ggplot(d, ggplot2::aes(x = .data[[col_name]])) +
        ggplot2::geom_bar() +
        ggplot2::labs(x = col_name)
    )
  }
  invisible(NULL)
}
# Draw the bar plots for the data frame df.
bar_plot(df)
Input: a data frame
Output: all possible bar plots of a non-numeric variable filled by a non-numeric variable.
Hint: Similar function
# Print a bar plot for every pair of non-numeric columns of a data frame:
# the earlier column on the x axis, filled by the later column.
#
# d: a data frame.
#
# Called for its side effect (printed ggplots); returns NULL invisibly.
bar_plot2 <- function(d) {
  n_cols <- length(d)
  # With fewer than two columns there is no pair; 1:(n_cols - 1) would
  # otherwise count downwards and index past the last column.
  if (n_cols < 2) {
    return(invisible(NULL))
  }
  for (i in seq_len(n_cols - 1)) {
    for (j in (i + 1):n_cols) {
      if (!is.numeric(d[[i]]) && !is.numeric(d[[j]])) {
        x_name <- names(d)[i]
        fill_name <- names(d)[j]
        # The bars are meant to be *filled* by the second variable, so map
        # `fill`; `color` would only tint the bar outlines.
        print(
          ggplot2::ggplot(
            d,
            ggplot2::aes(x = .data[[x_name]], fill = .data[[fill_name]])
          ) +
            ggplot2::geom_bar() +
            ggplot2::labs(x = x_name, fill = fill_name)
        )
      }
    }
  }
  invisible(NULL)
}
# Draw the pairwise bar plots for the data frame df.
bar_plot2(df)
Input: a data frame
Output:
all possible bar plots of a non-numeric variable filled by a non-numeric variable.
all possible density plots of a numeric variable colored by a non-numeric variable.
all possible scatter plots.
Hint: Combine this function, this function, and the function in Question 4. One way to combine them is to create a new function, quick_plot, and call these three functions within quick_plot.
# Print a scatter plot for every pair of numeric columns of a data frame:
# the earlier column on the x axis, the later column on the y axis.
#
# d: a data frame.
#
# Called for its side effect (printed ggplots); returns NULL invisibly.
scatter_plot <- function(d) {
  n_cols <- length(d)
  # With fewer than two columns there is no pair; 1:(n_cols - 1) would
  # otherwise count downwards and index past the last column.
  if (n_cols < 2) {
    return(invisible(NULL))
  }
  for (i in seq_len(n_cols - 1)) {
    for (j in (i + 1):n_cols) {
      if (is.numeric(d[[i]]) && is.numeric(d[[j]])) {
        x_name <- names(d)[i]
        y_name <- names(d)[j]
        # Map columns through the .data pronoun so the aesthetics refer to
        # the plot data rather than to external vectors.
        print(
          ggplot2::ggplot(
            d,
            ggplot2::aes(x = .data[[x_name]], y = .data[[y_name]])
          ) +
            ggplot2::geom_point() +
            ggplot2::labs(x = x_name, y = y_name)
        )
      }
    }
  }
  invisible(NULL)
}
# Print a density plot for every (numeric column, non-numeric column) pair of
# a data frame: the numeric column on the x axis, colored by the non-numeric
# column.
#
# d: a data frame.
#
# Called for its side effect (printed ggplots); returns NULL invisibly.
density_plot2 <- function(d) {
  is_num <- vapply(d, is.numeric, logical(1))
  # Pair every numeric column with every non-numeric column regardless of
  # their positions. Restricting to j > i skipped each pair in which the
  # non-numeric column comes first in the data frame.
  for (i in which(is_num)) {
    for (j in which(!is_num)) {
      x_name <- names(d)[i]
      color_name <- names(d)[j]
      print(
        ggplot2::ggplot(
          d,
          ggplot2::aes(x = .data[[x_name]], color = .data[[color_name]])
        ) +
          ggplot2::geom_density() +
          ggplot2::labs(x = x_name, color = color_name)
      )
    }
  }
  invisible(NULL)
}
# Print a bar plot for every pair of non-numeric columns of a data frame:
# the earlier column on the x axis, filled by the later column.
#
# d: a data frame.
#
# Called for its side effect (printed ggplots); returns NULL invisibly.
bar_plot2 <- function(d) {
  n_cols <- length(d)
  # With fewer than two columns there is no pair; 1:(n_cols - 1) would
  # otherwise count downwards and index past the last column.
  if (n_cols < 2) {
    return(invisible(NULL))
  }
  for (i in seq_len(n_cols - 1)) {
    for (j in (i + 1):n_cols) {
      if (!is.numeric(d[[i]]) && !is.numeric(d[[j]])) {
        x_name <- names(d)[i]
        fill_name <- names(d)[j]
        # The bars are meant to be *filled* by the second variable, so map
        # `fill`; `color` would only tint the bar outlines.
        print(
          ggplot2::ggplot(
            d,
            ggplot2::aes(x = .data[[x_name]], fill = .data[[fill_name]])
          ) +
            ggplot2::geom_bar() +
            ggplot2::labs(x = x_name, fill = fill_name)
        )
      }
    }
  }
  invisible(NULL)
}
# Draw the pairwise bar plots for the data frame df.
# NOTE(review): quick_plot(df) below also calls bar_plot2(), so these plots
# are rendered twice — confirm that is intended.
bar_plot2(df)
# Run all three pairwise plotting helpers on one data frame.
#
# d: a data frame, passed unchanged to each helper.
#
# Returns whatever bar_plot2(d) returns, since it is the last expression.
quick_plot <- function(d)
{
# Numeric vs numeric pairs.
scatter_plot(d)
# Numeric vs non-numeric pairs.
density_plot2(d)
# Non-numeric vs non-numeric pairs.
bar_plot2(d)
}
# Produce the scatter, density and bar plots for the data frame df.
quick_plot(df)